In [1]:
#practice collecting data from imdb
#Code below finds all the variables we want about actors and outputs them to a file handle we're calling fout
#note: indentation is a double-space

#first, import some handy packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time
import random
import re
import csv

In [2]:
#setup three output files (fbak for storing websites collected, fout for storing data scraped, fout2 for recording pages scraped)
#note: these handles stay open across cells and are closed in the last cell of the notebook
#fbak is opened in binary mode because it receives the raw bytes returned by urlopen
fbak = open('src-pgs.txt', 'wb')
#the text files are written as utf-8 explicitly, so names with non-ascii characters
#don't raise UnicodeEncodeError on platforms whose default encoding is not utf-8
#newline='' lets the csv module control line endings itself
fout = open('imdb-recs.txt', 'w', newline='', encoding='utf-8')
fout2 = open('imdb-pgs.txt', 'w', encoding='utf-8')
#create a handle to write tab-delimited data to the file fout
writer = csv.writer(fout, delimiter='\t')
#send the header row to this handle
writer.writerow(['movie','movie_url','actor','actor_url','role','role_url','year'])

52

In [3]:
#in the development stage, first copy html into a text file and read it in from there
#the with-block closes the file as soon as it has been read, so the handle is not leaked
with open('moviesrc.txt', 'r') as f:
  html = f.read()

#after perfecting your program, read in html directly from website
#html = urlopen('http://www.imdb.com/title/tt0451279/').read(); fbak.write(html);
#time.sleep(random.randint(4, 10));
#note: sleeping is essential to avoid annoying web servers by calling them too often!

In [4]:
#parse the html with python's built-in "html.parser" so its tag structure can be searched
#the parsed document is stored in a variable we're calling soup, which the cells below query
soup = BeautifulSoup(html, "html.parser")

In [5]:
#some examples of the power of beautiful soup
#see https://www.crummy.com/software/BeautifulSoup/bs4/doc/ for full details

#this snippet prints all the links on the page
#for link in soup.find_all('a'):
#  print(link.get('href'))

#and this one prints all the text (note the encoding method is used to avoid errors from unicode characters)
#txt = soup.get_text(); txt = txt.encode('utf-8', 'ignore'); txt = txt.decode('utf-8'); print(txt)

In [6]:
#the movie title is the string stored under the html 'title' tag
title_text = soup.title.string
#trim whitespace at the start/end, then round-trip through utf-8, ignoring anything that can't be encoded
movie = title_text.strip().encode('utf-8', 'ignore').decode('utf-8')
#remove the ' - IMDB' text (any capitalization) from the title
movie = re.sub(' - IMDB', '', movie, flags=re.IGNORECASE)

In [7]:
#get movie year, which is part of the title string, using regex
#prefer a 4-digit year inside parentheses, e.g. "Blade Runner 2049 (2017)" -> 2017,
#so that digits belonging to the title itself are not mistaken for the year
#(raw strings are used so that \d and \( reach the regex engine as written)
m = re.search(r'\([^()\d]*(\d{4})[^()]*\)', movie)
#if there is no parenthesized year, fall back to the first 4-digit run anywhere in the title
if not m:
  m = re.search(r'(\d{4})', movie)
if m:
  year = int(m.group(1))
else:
  #no year found: leave the field blank in the output
  year = ''

In [8]:
#get movie url: look up the <link rel="canonical"> tag in the page
tag = soup.find("link", rel="canonical")
#the url is the value stored under that tag's 'href' key
movie_url = tag["href"]

In [9]:
#locate the table whose class is "cast_list"; it contains the cast list
table = soup.find("table", class_="cast_list")
#collect every row ("tr" tag) of that table
rows = table.find_all("tr")
#start the row-counter at zero
rowct = 0
#report how many rows were found and what kind of object holds them
print(len(rows),"rows in the table")
print("rows are stored in a variable w/ data type:",type(rows))

16 rows in the table
rows are stored in a variable w/ data type: <class 'bs4.element.ResultSet'>


In [10]:
#reset the row-counter here so that re-running this cell does not double-count actors
rowct = 0
#number of fields expected between movie_url and year in the header row: actor, actor_url, role, role_url
nfields = 4
#iterate thru rows of the table
for row in rows:
  #uncomment the next line to print each row
  #print(row)
  #find all the columns in the row
  cols = row.find_all("td")
  #skip this row if it doesn't contain more than one column
  if len(cols) <= 1: continue
  #increment row-counter
  rowct += 1
  #create a list to hold the text/url pairs found in this row (actor first, then role)
  fields = []
  #iterate thru columns to store actors and roles
  for col in cols:
    #get link stored within the cell (html "a" tag)
    link = col.find("a")
    #cells without a link are skipped explicitly (instead of relying on a bare except to hide the error)
    if link is None: continue
    #get the text stored under the link
    txt = link.text
    #proceed only if text contains alphanumeric values (note: uses regex package called re)
    if not re.search('[a-zA-Z0-9]', txt): continue
    #strip out any whitespace at the start/end, round-trip through utf-8 ignoring anything that can't be encoded, and store it
    txt = txt.strip(); txt = txt.encode('utf-8', 'ignore').decode('utf-8')
    fields.append(txt)
    #get url from link, which is the value stored under the key 'href'
    url = link.get("href")
    if url is None:
      #link has no href: store a blank so the text/url pairs stay aligned
      fields.append('')
    else:
      #drop the '?ref...' tracking suffix and prepend the site address
      url = re.sub(r'\?ref.+', '', url)
      fields.append('http://www.imdb.com' + url)
  #pad with blanks when a pair is missing (e.g. a role with no link), so year always lands in the 'year' column
  fields += [''] * (nfields - len(fields))
  #assemble the record: movie title & url (retrieved earlier), actor/role fields, then the year
  rec = [movie, movie_url] + fields + [year]
  #send our list to the output handle
  writer.writerow(rec)

In [11]:
#summarize which movie was parsed and how many good rows were found
print('parsed',movie)
print('actors found:',rowct)
#record the url of the page we have scraped
fout2.write(movie_url + "\n")
#close all three file handles
for handle in (fbak, fout, fout2):
  handle.close()

parsed Wonder Woman (2017)
actors found: 15
